import numpy as np
import pandas as pd
from dask.array import outer

# Paths to the raw Excel workbooks (relative paths).
# `brand_file_path` is defined here but not read in the visible part of the script.
brand_file_path = '../../data/raw data/餐饮连锁品牌数据.xlsx'
cater_file_path = '../../data/raw data/餐饮连锁数据.xlsx'

# Worksheet names used to address the catering workbook.
sheet_names = [
    '门店信息',
    '菜品信息',
    '营销记录',
    '顾客评价',
]

# Select the second entry of the list and load that worksheet into a DataFrame.
sheet_name = sheet_names[1]
df_dish = pd.read_excel(cater_file_path, sheet_name=sheet_name)

# Visual separator in the console output.
print('=============================')
# ======================================
# 4️⃣ Detect and correct outliers
# ======================================

# --- Test 3: inspect the distribution of the numeric columns ---
print("\n【数值列统计描述】")
print(df_dish.describe())

# --- Outlier handling for the taste-score column (3-sigma rule) ---
# Name of the column to clean. Deliberately not called `outer`: that name is
# imported from dask.array at the top of the file and would be shadowed.
score_col = '口味评分'

# A value is treated as an outlier when it lies more than this many standard
# deviations away from the column mean.
SIGMA_THRESHOLD = 3

if score_col in df_dish.columns:
    scores = df_dish[score_col]

    # Mean and standard deviation of the column define the accepted band.
    score_mean = scores.mean()
    score_std = scores.std()
    upper = score_mean + SIGMA_THRESHOLD * score_std  # upper bound
    lower = score_mean - SIGMA_THRESHOLD * score_std  # lower bound

    # Build the outlier mask once and reuse it for both reporting and
    # replacement, so the two steps can never disagree.
    outlier_mask = (scores > upper) | (scores < lower)
    outliers = df_dish[outlier_mask]
    print(f"\n检测到异常值数量：{len(outliers)}")

    # Replace the outliers with the column median. The median is taken from
    # the column as it was before any replacement.
    score_median = scores.median()
    df_dish.loc[outlier_mask, score_col] = score_median
else:
    # Make a missing or misspelled column visible instead of skipping silently.
    print(f"\n未找到列“{score_col}”，跳过异常值处理")

